# coding=utf-8
from locale import *
import os
import sys
import datetime
from selenium import webdriver
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import  requests
import threading
import time
from bs4 import BeautifulSoup

site_host = 'https://www.juromeo.com'


def img_download(img, sku, timeout=30):
    """Download one product image into C:\\juromeo\\<category>\\<sku>\\.

    Parameters
    ----------
    img : str
        Absolute image URL; the file name is everything after the last '/'.
    sku : str
        Product SKU, used as the per-product sub-directory name.
    timeout : float, optional
        Seconds to wait for the HTTP response. The original call had no
        timeout, so a stalled connection hung the worker thread forever.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (the original silently
        wrote the error page to disk as if it were the image).

    NOTE(review): reads the module-level global ``category_name``, which the
    script entry point sets before any call — confirm if reusing elsewhere.
    """
    img_name = img[img.rindex("/") + 1:]
    # os.path.join instead of manual backslash concatenation.
    img_dir = os.path.join('C:\\juromeo', category_name, sku)
    os.makedirs(img_dir, exist_ok=True)
    img_path = os.path.join(img_dir, img_name)
    print(img_path + '------%s' % threading.current_thread())
    r = requests.get(img, timeout=timeout)
    r.raise_for_status()
    with open(img_path, 'wb') as f:
        f.write(r.content)

def run_crawler(url, open_file):
    """Crawl one category listing page and append one tab-separated line per
    product to *open_file*; product images are downloaded concurrently in
    background threads via img_download().

    Parameters
    ----------
    url : str
        Category listing page URL to crawl.
    open_file : str
        Path of the output file (opened in append mode, UTF-8).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument("--test-type")
    # Disable image loading in the browser; images are fetched separately
    # over plain HTTP by img_download().
    prefs = {"profile.managed_default_content_settings.images": 2}
    options.add_experimental_option("prefs", prefs)
    # 'chrome_options=' is deprecated/removed in modern Selenium; 'options='
    # is the supported keyword.
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        current_list_url = driver.current_url
        print('CURRENT CATEGORY URL:' + current_list_url)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Link container of every product on the current listing page.
        all_items = soup.select('div.item-content')
        # 'with' guarantees the output file is closed even if a selector
        # fails mid-loop (the original leaked the handle on exceptions).
        with open(open_file, 'a', encoding='utf-8') as file:
            for item in all_items:
                # Product detail page URL.
                item_url = site_host + item.select_one('a').get('href')
                print(item_url)
                # Open and parse the product detail page.
                driver.get(item_url.strip())
                soup_product = BeautifulSoup(driver.page_source, "html.parser")
                # Title
                title = soup_product.select_one('div.product-name > h1').get_text().strip()
                # SKU
                sku = soup_product.select_one('span.s-sku-product').get_text().strip()
                # Images: download each in a background thread so the crawl
                # is not blocked on image transfers.
                all_imgs = soup_product.select("ul.swiper-wrapper > li > img")
                for img in all_imgs:
                    img_url = img.get('data-original')
                    print(img_url)
                    threading.Thread(target=img_download, args=(img_url, sku)).start()
                # Current price
                price = soup_product.select_one('#item-price > span.price-value').get_text().strip()
                # Original (pre-discount) price
                old_price = soup_product.select_one('#item-old-price > span.price-value').get_text().strip()
                # Personalization options (full text, whitespace flattened)
                personalize_options = soup_product.select_one('#personalize-input').get_text().strip().replace('\t', ' ').replace('\n', '')
                # Description
                description = soup_product.select_one('#Description > div > div.std').get_text().strip().replace('\t', ' ').replace('\n', '')
                # Shipping and logistics info
                shipping = soup_product.select_one('div.productShipping').get_text().strip().replace('\t', ' ').replace('\n', '')
                # Attributes: <li> elements alternate name/value; even
                # positions close a pair with '|||', odd ones join with '=='.
                detail_items = soup_product.select_one('#product-attribute-specs-table').select('li')
                parts = []
                for idx, li in enumerate(detail_items, start=1):
                    parts.append(li.get_text() + ('|||' if idx % 2 == 0 else '=='))
                product_detail_txt = ''.join(parts)
                # One tab-separated record per product; join instead of the
                # original long '+' concatenation chain.
                line = '\t'.join((item_url, title, sku, price, old_price,
                                  product_detail_txt, personalize_options,
                                  description, shipping)) + '\n'
                print(line)
                file.write(line)
                # Throttle before the next product. (The original comment
                # said 5 seconds but the code slept 3; kept the actual 3s.)
                time.sleep(3)
    finally:
        # Always release the browser/driver process, even on a scraping
        # error — the original never called quit() and leaked ChromeDriver.
        driver.quit()


# Script entry point. The guard prevents the interactive prompts and the
# crawl from firing if this module is ever imported; when run as a script,
# behavior is unchanged. Assignments inside the guard are still module-level
# globals, which img_download() depends on (it reads ``category_name``).
if __name__ == "__main__":
    category_name = input("Input File Name:").strip()
    # Output is tab-separated text; the .xls suffix just makes Excel open it.
    file_name = category_name + ".xls"
    print("File Name is:", file_name)

    input_url = input("Input Category Url:").strip()
    print("Category Url is:", input_url)

    run_crawler(input_url, file_name)







